/*====================================================================*
 -  Copyright (C) 2001 Leptonica.  All rights reserved.
 -
 -  Redistribution and use in source and binary forms, with or without
 -  modification, are permitted provided that the following conditions
 -  are met:
 -  1. Redistributions of source code must retain the above copyright
 -     notice, this list of conditions and the following disclaimer.
 -  2. Redistributions in binary form must reproduce the above
 -     copyright notice, this list of conditions and the following
 -     disclaimer in the documentation and/or other materials
 -     provided with the distribution.
 -
 -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
 -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *====================================================================*/

/*!
 * \file  sarray1.c
 * <pre>
 *
 *      Create/Destroy/Copy
 *          SARRAY    *sarrayCreate()
 *          SARRAY    *sarrayCreateInitialized()
 *          SARRAY    *sarrayCreateWordsFromString()
 *          SARRAY    *sarrayCreateLinesFromString()
 *          void       sarrayDestroy()
 *          SARRAY    *sarrayCopy()
 *          SARRAY    *sarrayClone()
 *
 *      Add/Remove string
 *          l_int32    sarrayAddString()
 *          static l_int32  sarrayExtendArray()
 *          char      *sarrayRemoveString()
 *          l_int32    sarrayReplaceString()
 *          l_int32    sarrayClear()
 *
 *      Accessors
 *          l_int32    sarrayGetCount()
 *          char     **sarrayGetArray()
 *          char      *sarrayGetString()
 *          l_int32    sarrayGetRefcount()
 *          l_int32    sarrayChangeRefcount()
 *
 *      Conversion back to string
 *          char      *sarrayToString()
 *          char      *sarrayToStringRange()
 *
 *      Concatenate strings uniformly within the sarray
 *          SARRAY    *sarrayConcatUniformly()
 *
 *      Join 2 sarrays
 *          l_int32    sarrayJoin()
 *          l_int32    sarrayAppendRange()
 *
 *      Pad an sarray to be the same size as another sarray
 *          l_int32    sarrayPadToSameSize()
 *
 *      Convert word sarray to (formatted) line sarray
 *          SARRAY    *sarrayConvertWordsToLines()
 *
 *      Split string on separator list
 *          SARRAY    *sarraySplitString()
 *
 *      Filter sarray
 *          SARRAY    *sarraySelectBySubstring()
 *          SARRAY    *sarraySelectRange()
 *          l_int32    sarrayParseRange()
 *
 *      Serialize for I/O
 *          SARRAY    *sarrayRead()
 *          SARRAY    *sarrayReadStream()
 *          SARRAY    *sarrayReadMem()
 *          l_int32    sarrayWrite()
 *          l_int32    sarrayWriteStream()
 *          l_int32    sarrayWriteStderr()
 *          l_int32    sarrayWriteMem()
 *          l_int32    sarrayAppend()
 *
 *      Directory filenames
 *          SARRAY    *getNumberedPathnamesInDirectory()
 *          SARRAY    *getSortedPathnamesInDirectory()
 *          SARRAY    *convertSortedToNumberedPathnames()
 *          SARRAY    *getFilenamesInDirectory()
 *
 *      These functions are important for efficient manipulation
 *      of string data, and they have found widespread use in
 *      leptonica.  For example:
 *         (1) to generate text files: e.g., PostScript and PDF
 *             wrappers around sets of images
 *         (2) to parse text files: e.g., extracting prototypes
 *             from the source to generate allheaders.h
 *         (3) to generate code for compilation: e.g., the fast
 *             dwa code for arbitrary structuring elements.
 *
 *      Comments on usage:
 *
 *          The user is responsible for correctly disposing of strings
 *          that have been extracted from sarrays.  In the following,
 *          "str-not-owned" means the returned handle does not own the string,
 *          and "str-owned" means the returned handle owns the string.
 *            - To extract a string from an Sarray in order to inspect it
 *              or to make a copy of it later, get a handle to it:
 *                  copyflag = L_NOCOPY.
 *              In this case, you must neither free the string nor put it
 *              directly in another array:
 *                 str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
 *            - To extract a copy of a string from an Sarray, use:
 *                 str-owned = sarrayGetString(sa, index, L_COPY);
 *            - To insert a string that is in one array (sa) into another
 *              array (sa2), always leaving the first array intact, there
 *              are two options:
 *                 (1) use copyflag = L_COPY to make an immediate copy,
 *                     which you then add to the second array by insertion:
 *                       str-owned = sarrayGetString(sa, index, L_COPY);
 *                       sarrayAddString(sa2, str-owned, L_INSERT);
 *                 (2) use copyflag = L_NOCOPY to get another handle to
 *                     the string; you then add a copy of it to the
 *                     second string array:
 *                       str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
 *                       sarrayAddString(sa2, str-not-owned, L_COPY).
 *              sarrayAddString() transfers ownership to the Sarray, so never
 *              use L_INSERT if the string is owned by another array.
 *
 *              In all cases, when you use copyflag = L_COPY to extract
 *              a string from an array, you must either free it
 *              or insert it in an array that will be freed later.
 * </pre>
 */

#ifdef HAVE_CONFIG_H
#include <config_auto.h>
#endif  /* HAVE_CONFIG_H */

#include <string.h>
#ifndef _WIN32
#include <dirent.h>     /* unix only */
#include <sys/stat.h>
#include <limits.h>  /* needed for realpath() */
#include <stdlib.h>  /* needed for realpath() */
#endif  /* ! _WIN32 */
#include "allheaders.h"

static const l_uint32  MaxPtrArraySize = 50000000;    /* 50 million; upper bound on the ptr array size */
static const l_int32   InitialPtrArraySize = 50;      /*!< default ptr array size; the value is arbitrary */

    /* Static functions */
static l_int32 sarrayExtendArray(SARRAY *sa);


/*--------------------------------------------------------------------------*
 *                   String array create/destroy/copy/extend                *
 *--------------------------------------------------------------------------*/
/*!
 * \brief   sarrayCreate()
 *
 * \param[in]    n    size of string ptr array to be alloc'd; use 0 for default
 * \return  sarray, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) If %n is not in the range [1 ... MaxPtrArraySize], the
 *          default size InitialPtrArraySize is used instead.
 *      (2) The sarray is returned empty, with a refcount of 1.
 * </pre>
 */
SARRAY *
sarrayCreate(l_int32  n)
{
SARRAY  *sa;

    PROCNAME("sarrayCreate");

        /* n is known to be positive when the second test is evaluated */
    if (n <= 0 || (l_uint32)n > MaxPtrArraySize)
        n = InitialPtrArraySize;

        /* The struct must exist before its fields can be set */
    if ((sa = (SARRAY *)LEPT_CALLOC(1, sizeof(SARRAY))) == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
    if ((sa->array = (char **)LEPT_CALLOC(n, sizeof(char *))) == NULL) {
        sarrayDestroy(&sa);
        return (SARRAY *)ERROR_PTR("ptr array not made", procName, NULL);
    }

    sa->nalloc = n;
    sa->n = 0;
    sa->refcount = 1;
    return sa;
}


/*!
 * \brief   sarrayCreateInitialized()
 *
 * \param[in]    n         size of string ptr array to be alloc'd
 * \param[in]    initstr   string to be initialized on the full array
 * \return  sarray, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The returned sarray holds %n independent copies of %initstr.
 *      (2) If the sarray cannot be made, or if any of the copies cannot
 *          be added, a partially filled sarray is destroyed and NULL
 *          is returned.
 * </pre>
 */
SARRAY *
sarrayCreateInitialized(l_int32      n,
                        const char  *initstr)
{
l_int32  i;
SARRAY  *sa;

    PROCNAME("sarrayCreateInitialized");

    if (n <= 0)
        return (SARRAY *)ERROR_PTR("n must be > 0", procName, NULL);
    if (!initstr)
        return (SARRAY *)ERROR_PTR("initstr not defined", procName, NULL);

    if ((sa = sarrayCreate(n)) == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
    for (i = 0; i < n; i++) {
        if (sarrayAddString(sa, initstr, L_COPY)) {
            sarrayDestroy(&sa);
            return (SARRAY *)ERROR_PTR("initstr not added", procName, NULL);
        }
    }
    return sa;
}


/*!
 * \brief   sarrayCreateWordsFromString()
 *
 * \param[in]    string
 * \return  sarray, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Words are the substrings separated by space, tab or newline.
 *          The words are first counted, to size the sarray, and then
 *          a copy of each word is put into the sarray by
 *          sarraySplitString().
 * </pre>
 */
SARRAY *
sarrayCreateWordsFromString(const char  *string)
{
l_int32  i, len, nwords, inword, iswhite;
SARRAY  *sa;

    PROCNAME("sarrayCreateWordsFromString");

    if (string == NULL)
        return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);

        /* Count the words: each transition from white space (or from
         * the start of the string) into non-white space begins one. */
    len = strlen(string);
    nwords = 0;
    inword = FALSE;
    for (i = 0; i < len; i++) {
        iswhite = (string[i] == ' ' || string[i] == '\t' ||
                   string[i] == '\n');
        if (iswhite) {
            inword = FALSE;
        } else if (!inword) {
            inword = TRUE;
            nwords++;
        }
    }

    sa = sarrayCreate(nwords);
    if (sa == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
    sarraySplitString(sa, string, " \n\t");

    return sa;
}


/*!
 * \brief   sarrayCreateLinesFromString()
 *
 * \param[in]    string
 * \param[in]    blankflag    0 to exclude blank lines; 1 to include
 * \return  sarray, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The input is broken at each newline, and a copy of each
 *          resulting line is put in a new sarray.  Text following the
 *          last newline is taken as a final line.
 *      (2) The newline characters are removed from each substring.
 *          With %blankflag == 1, a carriage return immediately before
 *          a newline is removed as well.
 *      (3) With %blankflag == 0, the string is split on the characters
 *          '\r' and '\n' by sarraySplitString().
 * </pre>
 */
SARRAY *
sarrayCreateLinesFromString(const char  *string,
                            l_int32      blankflag)
{
l_int32  i, nlines, len, start;
char    *buf, *line;
SARRAY  *sa;

    PROCNAME("sarrayCreateLinesFromString");

    if (string == NULL)
        return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);

        /* Use the newline count as the initial size of the sarray */
    len = strlen(string);
    nlines = 0;
    for (i = 0; i < len; i++)
        nlines += (string[i] == '\n') ? 1 : 0;

    sa = sarrayCreate(nlines);
    if (sa == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);

        /* Dropping blank lines is handled entirely by the splitter */
    if (!blankflag) {
        sarraySplitString(sa, string, "\r\n");
        return sa;
    }

        /* Keep blank lines as empty strings.  Work on a copy, where each
         * line ending is overwritten by a nul to terminate the line. */
    buf = stringNew(string);
    if (buf == NULL) {
        sarrayDestroy(&sa);
        return (SARRAY *)ERROR_PTR("cstring not made", procName, NULL);
    }

    start = 0;  /* offset in buf of the line being collected */
    for (i = 0; i < len; i++) {
        if (buf[i] != '\n')
            continue;
        buf[i] = '\0';
        if (i > 0 && buf[i - 1] == '\r')
            buf[i - 1] = '\0';  /* also remove Windows CR */
        line = stringNew(buf + start);
        if (line == NULL) {
            sarrayDestroy(&sa);
            LEPT_FREE(buf);
            return (SARRAY *)ERROR_PTR("substring not made",
                                        procName, NULL);
        }
        sarrayAddString(sa, line, L_INSERT);
        start = i + 1;
    }

        /* Pick up text after the last newline, if there is any */
    if (start < len) {
        line = stringNew(buf + start);
        if (line == NULL) {
            sarrayDestroy(&sa);
            LEPT_FREE(buf);
            return (SARRAY *)ERROR_PTR("substring not made",
                                       procName, NULL);
        }
        sarrayAddString(sa, line, L_INSERT);
    }

    LEPT_FREE(buf);
    return sa;
}


/*!
 * \brief   sarrayDestroy()
 *
 * \param[in,out]   psa    will be set to null before returning
 * \return  void
 *
 * <pre>
 * Notes:
 *      (1) The ref count is decremented.  If it is then <= 0, each
 *          string up to the current count is freed, followed by the
 *          ptr array and the sarray itself.
 *      (2) The input ptr is always nulled, whether or not the sarray
 *          was freed.
 * </pre>
 */
void
sarrayDestroy(SARRAY  **psa)
{
l_int32  i, nstr;
char   **strings;
SARRAY  *sa;

    PROCNAME("sarrayDestroy");

    if (!psa) {
        L_WARNING("ptr address is NULL!\n", procName);
        return;
    }
    sa = *psa;
    if (!sa)
        return;

        /* Release this handle; the data goes only with the last one */
    sarrayChangeRefcount(sa, -1);
    if (sarrayGetRefcount(sa) <= 0) {
        strings = sa->array;
        if (strings != NULL) {
            nstr = sa->n;
            for (i = 0; i < nstr; i++) {
                if (strings[i] != NULL)
                    LEPT_FREE(strings[i]);
            }
            LEPT_FREE(strings);
        }
        LEPT_FREE(sa);
    }
    *psa = NULL;
}


/*!
 * \brief   sarrayCopy()
 *
 * \param[in]    sa    string array
 * \return  copy of sarray, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The copy is made with the same ptr array size as %sa, and
 *          it holds its own copy of every string in %sa.
 * </pre>
 */
SARRAY *
sarrayCopy(SARRAY  *sa)
{
l_int32  i, nstr;
char   **strings;
SARRAY  *csa;

    PROCNAME("sarrayCopy");

    if (sa == NULL)
        return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);

    csa = sarrayCreate(sa->nalloc);
    if (csa == NULL)
        return (SARRAY *)ERROR_PTR("csa not made", procName, NULL);

    strings = sa->array;
    nstr = sa->n;
    for (i = 0; i < nstr; i++)
        sarrayAddString(csa, strings[i], L_COPY);

    return csa;
}


/*!
 * \brief   sarrayClone()
 *
 * \param[in]    sa    string array
 * \return  ptr to same sarray, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) No data is copied; the ref count of %sa is incremented
 *          and the input ptr is returned.
 * </pre>
 */
SARRAY *
sarrayClone(SARRAY  *sa)
{
    PROCNAME("sarrayClone");

    if (sa == NULL)
        return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);

        /* One more handle now refers to the same sarray */
    sarrayChangeRefcount(sa, 1);
    return sa;
}


/*!
 * \brief   sarrayAddString()
 *
 * \param[in]    sa         string array
 * \param[in]    string     string to be added
 * \param[in]    copyflag   L_INSERT, L_NOCOPY or L_COPY
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) See usage comments at the top of this file.  L_INSERT is
 *          equivalent to L_NOCOPY.
 *      (2) With L_INSERT or L_NOCOPY the input ptr itself is stored,
 *          and the string will be freed when the sarray is destroyed.
 *          With L_COPY a new copy is stored and %string is untouched.
 *      (3) The ptr array is extended when it is full.
 *      (4) If a copy cannot be made, the sarray is not changed, so
 *          that it never holds a null ptr below the current count.
 * </pre>
 */
l_ok
sarrayAddString(SARRAY      *sa,
                const char  *string,
                l_int32      copyflag)
{
char    *newstr;
l_int32  n;

    PROCNAME("sarrayAddString");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!string)
        return ERROR_INT("string not defined", procName, 1);
    if (copyflag != L_INSERT && copyflag != L_NOCOPY && copyflag != L_COPY)
        return ERROR_INT("invalid copyflag", procName, 1);

    n = sarrayGetCount(sa);
    if (n >= sa->nalloc) {
        if (sarrayExtendArray(sa))
            return ERROR_INT("extension failed", procName, 1);
    }

    if (copyflag == L_COPY) {
            /* Don't store (and count) a null ptr if the copy fails */
        if ((newstr = stringNew(string)) == NULL)
            return ERROR_INT("string copy not made", procName, 1);
    } else {  /* L_INSERT or L_NOCOPY */
        newstr = (char *)string;
    }
    sa->array[n] = newstr;
    sa->n++;
    return 0;
}


/*!
 * \brief   sarrayExtendArray()
 *
 * \param[in]    sa    string array
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Doubles the size of the string ptr array, except that the
 *          size is never made larger than MaxPtrArraySize.
 *      (2) The max number of strings is 50M.  It is an error to call
 *          this when the ptr array is already at the maximum size.
 *      (3) The sarray is updated only if the reallocation succeeds.
 *          On failure, %nalloc is unchanged and the array ptr is not
 *          overwritten here.  NOTE(review): what reallocNew() does with
 *          the input array on failure is not visible in this file.
 * </pre>
 */
static l_int32
sarrayExtendArray(SARRAY  *sa)
{
char   **newarray;
l_int32  newalloc;
size_t   oldsize, newsize;

    PROCNAME("sarrayExtendArray");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (sa->nalloc >= MaxPtrArraySize)
        return ERROR_INT("sa at maximum ptr size; can't extend", procName, 1);

        /* Double the allocation, clamping at the maximum */
    if (sa->nalloc > MaxPtrArraySize / 2)
        newalloc = (l_int32)MaxPtrArraySize;
    else
        newalloc = 2 * sa->nalloc;
    oldsize = sa->nalloc * sizeof(char *);
    newsize = newalloc * sizeof(char *);

        /* Commit the new array and size only after the reallocation
         * has succeeded, so that a failure can neither leave nalloc
         * larger than the array nor replace the array ptr by NULL. */
    newarray = (char **)reallocNew((void **)&sa->array, oldsize, newsize);
    if (newarray == NULL)
        return ERROR_INT("new ptr array not returned", procName, 1);
    sa->array = newarray;
    sa->nalloc = newalloc;

    return 0;
}


/*!
 * \brief   sarrayRemoveString()
 *
 * \param[in]    sa       string array
 * \param[in]    index    of string within sarray
 * \return  removed string, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The string is taken out of the sarray, not copied.  The
 *          sarray no longer refers to it and will not free it, so the
 *          caller is responsible for freeing the returned string.
 *      (2) The strings following %index are shifted down by one,
 *          maintaining the original ordering, and the vacated slot at
 *          the end is nulled so that no stale ptr remains in the array.
 * </pre>
 */
char *
sarrayRemoveString(SARRAY  *sa,
                   l_int32  index)
{
char    *string;
char   **array;
l_int32  i, n;

    PROCNAME("sarrayRemoveString");

    if (!sa)
        return (char *)ERROR_PTR("sa not defined", procName, NULL);

    if ((array = sarrayGetArray(sa, NULL, &n)) == NULL)
        return (char *)ERROR_PTR("array not returned", procName, NULL);

    if (index < 0 || index >= n)
        return (char *)ERROR_PTR("array index out of bounds", procName, NULL);

    string = array[index];

        /* If removed string is not at end of array, shift
         * to fill in, maintaining original ordering.
         * Note: if we didn't care about the order, we could
         * put the last string array[n - 1] directly into the hole.  */
    for (i = index; i < n - 1; i++)
        array[i] = array[i + 1];

        /* The last slot would otherwise still hold either a duplicate
         * of the last string ptr or the ptr that is being returned. */
    array[n - 1] = NULL;

    sa->n--;
    return string;
}


/*!
 * \brief   sarrayReplaceString()
 *
 * \param[in]    sa         string array
 * \param[in]    index      of string within sarray to be replaced
 * \param[in]    newstr     string to replace existing one
 * \param[in]    copyflag   L_INSERT, L_COPY
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This destroys an existing string and replaces it with
 *          the new string or a copy of it.
 *      (2) By design, an sarray is always compacted, so there are
 *          never any holes (null ptrs) in the ptr array up to the
 *          current count.  To maintain this, with L_COPY the copy is
 *          made before the existing string is freed; if the copy
 *          fails, the sarray is left unchanged.
 *      (3) It is safe for %newstr to be the string that is already at
 *          %index.  With L_COPY it is copied before being freed, and
 *          with L_INSERT the sarray is left unchanged.
 * </pre>
 */
l_ok
sarrayReplaceString(SARRAY  *sa,
                    l_int32  index,
                    char    *newstr,
                    l_int32  copyflag)
{
char    *str;
l_int32  n;

    PROCNAME("sarrayReplaceString");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    n = sarrayGetCount(sa);
    if (index < 0 || index >= n)
        return ERROR_INT("array index out of bounds", procName, 1);
    if (!newstr)
        return ERROR_INT("newstr not defined", procName, 1);
    if (copyflag != L_INSERT && copyflag != L_COPY)
        return ERROR_INT("invalid copyflag", procName, 1);

        /* Get the replacement while the existing string is still
         * valid; newstr may be that very string. */
    if (copyflag == L_INSERT) {
        str = newstr;
    } else {  /* L_COPY */
        if ((str = stringNew(newstr)) == NULL)
            return ERROR_INT("copy of newstr not made", procName, 1);
    }

        /* Don't free a string that is being inserted over itself */
    if (sa->array[index] != str)
        LEPT_FREE(sa->array[index]);
    sa->array[index] = str;
    return 0;
}


/*!
 * \brief   sarrayClear()
 *
 * \param[in]    sa    string array
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) All strings are freed and the count is reset to 0.  The ptr
 *          array itself is retained, with the used ptrs set to null.
 * </pre>
 */
l_ok
sarrayClear(SARRAY  *sa)
{
l_int32  i, nstr;
char   **strings;

    PROCNAME("sarrayClear");

    if (sa == NULL)
        return ERROR_INT("sa not defined", procName, 1);

    nstr = sa->n;
    strings = sa->array;
    for (i = 0; i < nstr; i++) {
        LEPT_FREE(strings[i]);
        strings[i] = NULL;  /* leave no dangling ptr behind */
    }
    sa->n = 0;
    return 0;
}


/*----------------------------------------------------------------------*
 *                               Accessors                              *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayGetCount()
 *
 * \param[in]    sa    string array
 * \return  number of strings in the sarray; 0 if there are none or
 *          on error
 */
l_int32
sarrayGetCount(SARRAY  *sa)
{
    PROCNAME("sarrayGetCount");

    if (sa == NULL)
        return ERROR_INT("sa not defined", procName, 0);

    return sa->n;
}


/*!
 * \brief   sarrayGetArray()
 *
 * \param[in]    sa        string array
 * \param[out]   pnalloc   [optional] size of the allocated string ptr array
 * \param[out]   pn        [optional] number of strings in the sarray
 * \return  ptr to string array, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Caution: this returns the ptr array held by the sarray, not
 *          a copy of it, so the caller must not destroy it!
 * </pre>
 */
char **
sarrayGetArray(SARRAY   *sa,
               l_int32  *pnalloc,
               l_int32  *pn)
{
    PROCNAME("sarrayGetArray");

    if (sa == NULL)
        return (char **)ERROR_PTR("sa not defined", procName, NULL);

        /* Fill in only the sizes that were asked for */
    if (pnalloc != NULL)
        *pnalloc = sa->nalloc;
    if (pn != NULL)
        *pn = sa->n;

    return sa->array;
}


/*!
 * \brief   sarrayGetString()
 *
 * \param[in]    sa         string array
 * \param[in]    index      to the index-th string
 * \param[in]    copyflag   L_NOCOPY or L_COPY
 * \return  string, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) See usage comments at the top of this file.
 *      (2) With L_NOCOPY, the ptr stored in the sarray is returned; the
 *          sarray still owns the string.  With L_COPY, a new copy of
 *          the string is returned.
 * </pre>
 */
char *
sarrayGetString(SARRAY  *sa,
                l_int32  index,
                l_int32  copyflag)
{
char  *str;

    PROCNAME("sarrayGetString");

    if (sa == NULL)
        return (char *)ERROR_PTR("sa not defined", procName, NULL);
    if (index < 0 || index >= sa->n)
        return (char *)ERROR_PTR("index not valid", procName, NULL);
    if (copyflag != L_NOCOPY && copyflag != L_COPY)
        return (char *)ERROR_PTR("invalid copyflag", procName, NULL);

    str = sa->array[index];
    if (copyflag == L_COPY)
        return stringNew(str);
    return str;  /* L_NOCOPY */
}


/*!
 * \brief   sarrayGetRefcount()
 *
 * \param[in]    sa     string array
 * \return  current ref count of the sarray, or UNDEF on error
 */
l_int32
sarrayGetRefcount(SARRAY  *sa)
{
    PROCNAME("sarrayGetRefcount");

    if (sa == NULL)
        return ERROR_INT("sa not defined", procName, UNDEF);

    return sa->refcount;
}


/*!
 * \brief   sarrayChangeRefcount()
 *
 * \param[in]    sa      string array
 * \param[in]    delta   change to be applied
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) %delta is added to the ref count; use a negative value
 *          to decrement.  Nothing is freed here.
 * </pre>
 */
l_ok
sarrayChangeRefcount(SARRAY  *sa,
                     l_int32  delta)
{
    PROCNAME("sarrayChangeRefcount");

        /* Return 1 on error, as documented, and as for the other
         * functions in this file that return l_ok. */
    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    sa->refcount += delta;
    return 0;
}


/*----------------------------------------------------------------------*
 *                      Conversion to string                           *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayToString()
 *
 * \param[in]    sa          string array
 * \param[in]    addnlflag   flag: 0 adds nothing to each substring
 *                                 1 adds '\n' to each substring
 *                                 2 adds ' ' to each substring
 *                                 3 adds ',' to each substring
 * \return  dest string, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Concatenates all the strings in the sarray, preserving
 *          all white space.
 *      (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
 *      (3) This is sarrayToStringRange() applied to the full sarray.
 *      (4) This function was NOT implemented as:
 *            for (i = 0; i < n; i++)
 *                strcat(dest, sarrayGetString(sa, i, L_NOCOPY));
 *          Do you see why?
 * </pre>
 */
char *
sarrayToString(SARRAY  *sa,
               l_int32  addnlflag)
{
char  *dest;

    PROCNAME("sarrayToString");

    if (sa == NULL)
        return (char *)ERROR_PTR("sa not defined", procName, NULL);

        /* first = 0 with nstrings = 0 selects every string */
    dest = sarrayToStringRange(sa, 0, 0, addnlflag);
    return dest;
}


/*!
 * \brief   sarrayToStringRange()
 *
 * \param[in]   sa          string array
 * \param[in]   first       index of first string to use; starts with 0
 * \param[in]   nstrings    number of strings to append into the result; use
 *                          0 to append to the end of the sarray
 * \param[in]   addnlflag   flag: 0 adds nothing to each substring
 *                                1 adds '\n' to each substring
 *                                2 adds ' ' to each substring
 *                                3 adds ',' to each substring
 * \return  dest string, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Concatenates the specified strings in the sarray, preserving
 *          all white space.
 *      (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
 *      (3) If the sarray is empty, this returns a string with just
 *          the character corresponding to %addnlflag.
 *      (4) If %nstrings goes beyond the end of the sarray, it is
 *          truncated.  A negative %nstrings selects no strings, and an
 *          empty string is returned.
 *      (5) The sizes are accumulated as size_t, so that a very large
 *          amount of text cannot overflow a signed 32 bit integer.
 * </pre>
 */
char *
sarrayToStringRange(SARRAY  *sa,
                    l_int32  first,
                    l_int32  nstrings,
                    l_int32  addnlflag)
{
static const char  sepchars[] = { '\0', '\n', ' ', ',' };
char     sep;
char     sepstr[2];
char    *dest, *str;
l_int32  n, i, last;
size_t   size, index, len;

    PROCNAME("sarrayToStringRange");

    if (!sa)
        return (char *)ERROR_PTR("sa not defined", procName, NULL);
    if (addnlflag < 0 || addnlflag > 3)
        return (char *)ERROR_PTR("invalid addnlflag", procName, NULL);

        /* The char added after each string; nul means add nothing */
    sep = sepchars[addnlflag];

    n = sarrayGetCount(sa);

        /* Empty sa; return char corresponding to addnlflag only */
    if (n == 0) {
        if (first != 0)
            return (char *)ERROR_PTR("first not valid", procName, NULL);
        sepstr[0] = sep;
        sepstr[1] = '\0';
        return stringNew(sepstr);
    }

        /* Determine the range of string indices to be used */
    if (first < 0 || first >= n)
        return (char *)ERROR_PTR("first not valid", procName, NULL);
    if (nstrings == 0 || (nstrings > n - first))
        nstrings = n - first;  /* no overflow */
    last = first + nstrings - 1;

        /* Determine the size of the output string, including
         * the terminating nul */
    size = 1;
    for (i = first; i <= last; i++) {
        if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
            return (char *)ERROR_PTR("str not found", procName, NULL);
        size += strlen(str) + (sep ? 1 : 0);
    }
    if ((dest = (char *)LEPT_CALLOC(size, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("dest not made", procName, NULL);

        /* Construct the output.  Each string is copied to the current
         * end of dest; the zeroed allocation supplies the final nul. */
    index = 0;
    for (i = first; i <= last; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        len = strlen(str);
        memcpy(dest + index, str, len);
        index += len;
        if (sep)
            dest[index++] = sep;
    }

    return dest;
}


/*----------------------------------------------------------------------*
 *           Concatenate strings uniformly within the sarray            *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayConcatUniformly()
 *
 * \param[in]    sa          string array
 * \param[in]    n           number of strings in output sarray
 * \param[in]    addnlflag   flag: 0 adds nothing to each substring
 *                                 1 adds '\n' to each substring
 *                                 2 adds ' ' to each substring
 *                                 3 adds ',' to each substring
 * \return  dest sarray, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Divides %sa into %n essentially equal sets of strings,
 *          concatenates each set individually, and makes an output
 *          sarray with the %n concatenations.  %n must not exceed the
 *          number of strings in %sa.
 *      (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
 *      (3) If the bin sizes, the output sarray or any of the
 *          concatenations cannot be made, everything made here is
 *          destroyed and NULL is returned.
 * </pre>
 */
SARRAY *
sarrayConcatUniformly(SARRAY  *sa,
                      l_int32  n,
                      l_int32  addnlflag)
{
l_int32  i, first, ntot, nstr;
char    *str;
NUMA    *na;
SARRAY  *saout;

    PROCNAME("sarrayConcatUniformly");

    if (!sa)
        return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
    ntot = sarrayGetCount(sa);
    if (n < 1)
        return (SARRAY *)ERROR_PTR("n must be >= 1", procName, NULL);
    if (n > ntot) {
        L_ERROR("n = %d > ntot = %d\n", procName, n, ntot);
        return NULL;
    }
    if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2 && addnlflag != 3)
        return (SARRAY *)ERROR_PTR("invalid addnlflag", procName, NULL);

    if ((na = numaGetUniformBinSizes(ntot, n)) == NULL)
        return (SARRAY *)ERROR_PTR("na not made", procName, NULL);
    if ((saout = sarrayCreate(0)) == NULL) {
        numaDestroy(&na);
        return (SARRAY *)ERROR_PTR("saout not made", procName, NULL);
    }

    for (i = 0, first = 0; i < n; i++) {
        nstr = 0;  /* defined value, even if the lookup fails */
        numaGetIValue(na, i, &nstr);
        str = sarrayToStringRange(sa, first, nstr, addnlflag);
        if (!str || sarrayAddString(saout, str, L_INSERT)) {
            LEPT_FREE(str);  /* not taken by saout */
            sarrayDestroy(&saout);
            numaDestroy(&na);
            return (SARRAY *)ERROR_PTR("concatenation not made",
                                       procName, NULL);
        }
        first += nstr;
    }
    numaDestroy(&na);
    return saout;
}


/*----------------------------------------------------------------------*
 *                           Join 2 sarrays                             *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayJoin()
 *
 * \param[in]    sa1   to be added to
 * \param[in]    sa2   append to sa1
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) A copy of each string in %sa2 is appended, in order, to
 *          %sa1.  %sa2 is not changed.
 *      (2) This stops with an error at the first string that
 *          cannot be added.
 * </pre>
 */
l_ok
sarrayJoin(SARRAY  *sa1,
           SARRAY  *sa2)
{
char    *src;
l_int32  i, nadd;

    PROCNAME("sarrayJoin");

    if (sa1 == NULL)
        return ERROR_INT("sa1 not defined", procName, 1);
    if (sa2 == NULL)
        return ERROR_INT("sa2 not defined", procName, 1);

    nadd = sarrayGetCount(sa2);
    i = 0;
    while (i < nadd) {
            /* Borrow the string from sa2; sa1 gets its own copy */
        src = sarrayGetString(sa2, i, L_NOCOPY);
        if (sarrayAddString(sa1, src, L_COPY) == 1) {
            L_ERROR("failed to add string at i = %d\n", procName, i);
            return 1;
        }
        i++;
    }
    return 0;
}


/*!
 * \brief   sarrayAppendRange()
 *
 * \param[in]    sa1     to be added to
 * \param[in]    sa2     append specified range of strings in sa2 to sa1
 * \param[in]    start   index of first string of sa2 to append
 * \param[in]    end     index of last string of sa2 to append;
 *                       -1 to append to end of array
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Copies of the strings in sarray2 are added to sarray1.
 *      (2) The [start ... end] range is truncated if necessary.
 *      (3) Use end == -1 to append to the end of sa2.
 *      (4) It is an error if the truncated range is empty; this
 *          includes the case where %sa2 has no strings.
 *      (5) As with sarrayJoin(), this stops with an error at the
 *          first string that cannot be added.
 * </pre>
 */
l_ok
sarrayAppendRange(SARRAY  *sa1,
                  SARRAY  *sa2,
                  l_int32  start,
                  l_int32  end)
{
char    *str;
l_int32  n, i;

    PROCNAME("sarrayAppendRange");

    if (!sa1)
        return ERROR_INT("sa1 not defined", procName, 1);
    if (!sa2)
        return ERROR_INT("sa2 not defined", procName, 1);

        /* Clip the requested range to the strings that exist */
    if (start < 0)
        start = 0;
    n = sarrayGetCount(sa2);
    if (end < 0 || end >= n)
        end = n - 1;
    if (start > end)
        return ERROR_INT("start > end", procName, 1);

    for (i = start; i <= end; i++) {
        str = sarrayGetString(sa2, i, L_NOCOPY);
        if (sarrayAddString(sa1, str, L_COPY)) {
            L_ERROR("failed to add string at i = %d\n", procName, i);
            return 1;
        }
    }

    return 0;
}


/*----------------------------------------------------------------------*
 *          Pad an sarray to be the same size as another sarray         *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayPadToSameSize()
 *
 * \param[in]    sa1, sa2
 * \param[in]    padstring
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) If two sarrays have different size, this adds enough
 *          instances of %padstring to the smaller so that they are
 *          the same size.  It is useful when two or more sarrays
 *          are being sequenced in parallel, and it is necessary to
 *          find a valid string at each index.
 *      (2) Each instance added is a separate copy of %padstring.
 *      (3) If the sarrays are already the same size, nothing is done
 *          and %padstring is not used.  Otherwise, it is an error if
 *          %padstring is not defined or if an instance cannot be added.
 * </pre>
 */
l_ok
sarrayPadToSameSize(SARRAY      *sa1,
                    SARRAY      *sa2,
                    const char  *padstring)
{
l_int32  i, n1, n2, npad;
SARRAY  *sapad;

    PROCNAME("sarrayPadToSameSize");

    if (!sa1 || !sa2)
        return ERROR_INT("both sa1 and sa2 not defined", procName, 1);

    n1 = sarrayGetCount(sa1);
    n2 = sarrayGetCount(sa2);
    if (n1 == n2)
        return 0;
    if (!padstring)
        return ERROR_INT("padstring not defined", procName, 1);

        /* Pad the smaller sarray up to the size of the larger */
    if (n1 < n2) {
        sapad = sa1;
        npad = n2 - n1;
    } else {
        sapad = sa2;
        npad = n1 - n2;
    }
    for (i = 0; i < npad; i++) {
        if (sarrayAddString(sapad, padstring, L_COPY))
            return ERROR_INT("padstring not added", procName, 1);
    }

    return 0;
}


/*----------------------------------------------------------------------*
 *                   Convert word sarray to line sarray                 *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayConvertWordsToLines()
 *
 * \param[in]    sa          sarray of individual words
 * \param[in]    linesize    max num of chars in each line
 * \return  saout sa of formatted lines, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This is useful for re-typesetting text to a specific maximum
 *          line length.  The individual words in the input sarray
 *          are concatenated into textlines.  An input word string of zero
 *          length is taken to be a paragraph separator.  Each time
 *          such a string is found, the current line is ended and
 *          a new line is also produced that contains just the
 *          string of zero length "".  When the output sarray
 *          of lines is eventually converted to a string with newlines
 *          typically appended to each line string, the empty
 *          strings are just converted to newlines, producing the visible
 *          paragraph separation.
 *      (2) What happens when a word is larger than linesize?
 *          We write it out as a single line anyway!  Words preceding
 *          or following this long word are placed on lines preceding
 *          or following the line with the long word.  Why this choice?
 *          Long "words" found in text documents are typically URLs, and
 *          it's often desirable not to put newlines in the middle of a URL.
 *          The text display program e.g., text editor will typically
 *          wrap the long "word" to fit in the window.
 *      (3) Each line is assembled from its words with sarrayToString(),
 *          using the flag that adds a space after each word.  The
 *          exception is a long word that starts a line; it is copied
 *          to the output unchanged.
 * </pre>
 */
SARRAY *
sarrayConvertWordsToLines(SARRAY  *sa,
                          l_int32  linesize)
{
char    *wd, *strl;
char     emptystring[] = "";
l_int32  n, i, len, totlen;
SARRAY  *sal, *saout;

    PROCNAME("sarrayConvertWordsToLines");

    if (!sa)
        return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);

    if ((saout = sarrayCreate(0)) == NULL)
        return (SARRAY *)ERROR_PTR("saout not made", procName, NULL);
    n = sarrayGetCount(sa);
    totlen = 0;  /* chars in the current line, counting a space per word */
    sal = NULL;  /* words collected for the current line */
    for (i = 0; i < n; i++) {
        if (!sal)
            sal = sarrayCreate(0);
        wd = sarrayGetString(sa, i, L_NOCOPY);
        len = strlen(wd);
        if (len == 0) {  /* end of paragraph: end line & insert blank line */
            if (totlen > 0) {
                strl = sarrayToString(sal, 2);
                sarrayAddString(saout, strl, L_INSERT);
            }
            sarrayAddString(saout, emptystring, L_COPY);
            sarrayDestroy(&sal);
            totlen = 0;
        } else if (totlen == 0 && len + 1 > linesize) {  /* long word! */
            sarrayAddString(saout, wd, L_COPY);  /* copy to one line */
        } else if (totlen + len + 1 > linesize) {  /* end line & start new */
            strl = sarrayToString(sal, 2);
            sarrayAddString(saout, strl, L_INSERT);
            sarrayDestroy(&sal);
            sal = sarrayCreate(0);
            sarrayAddString(sal, wd, L_COPY);
            totlen = len + 1;
        } else {  /* add to current line */
            sarrayAddString(sal, wd, L_COPY);
            totlen += len + 1;
        }
    }
    if (totlen > 0) {   /* didn't end with blank line; output last line */
        strl = sarrayToString(sal, 2);
        sarrayAddString(saout, strl, L_INSERT);
    }

        /* Always release the line buffer.  It can exist with totlen == 0,
         * e.g., when the last word was a long word written out directly;
         * sarrayDestroy() does nothing if sal is NULL. */
    sarrayDestroy(&sal);

    return saout;
}


/*----------------------------------------------------------------------*
 *                    Split string on separator list                    *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarraySplitString()
 *
 * \param[in]   sa            to append to; typically empty initially
 * \param[in]   str           string to split; not changed
 * \param[in]   separators    characters that split input string
 * \return   0 if OK, 1 on error.
 *
 * <pre>
 * Notes:
 *      (1) This uses strtokSafe().  See the notes there in utils.c.
 *      (2) Each token returned by strtokSafe() is added to %sa
 *          with L_INSERT; no further copy is made here.
 * </pre>
 */
l_int32
sarraySplitString(SARRAY      *sa,
                  const char  *str,
                  const char  *separators)
{
char  *strcopy, *token, *state;

    PROCNAME("sarraySplitString");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!str)
        return ERROR_INT("str not defined", procName, 1);
    if (!separators)
        return ERROR_INT("separators not defined", procName, 1);

        /* Tokenize a private copy, leaving the caller's string alone */
    strcopy = stringNew(str);
    state = NULL;
    token = strtokSafe(strcopy, separators, &state);
    if (token != NULL)
        sarrayAddString(sa, token, L_INSERT);
    for (;;) {
        token = strtokSafe(NULL, separators, &state);
        if (token == NULL)
            break;
        sarrayAddString(sa, token, L_INSERT);
    }
    LEPT_FREE(strcopy);

    return 0;
}


/*----------------------------------------------------------------------*
 *                              Filter sarray                           *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarraySelectBySubstring()
 *
 * \param[in]    sain     input sarray
 * \param[in]    substr   [optional] substring for matching; can be NULL
 * \return  saout output sarray, filtered with substring or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This selects all strings in sain that have substr as a substring.
 *          Note that we can't use strncmp() because we're looking for
 *          a match to the substring anywhere within each string.
 *      (2) If substr == NULL, or if sain is empty, this returns a copy
 *          of the sarray.
 *      (3) The selected strings are copied into saout.
 * </pre>
 */
SARRAY *
sarraySelectBySubstring(SARRAY      *sain,
                        const char  *substr)
{
char    *candidate;
l_int32  count, idx, where, matched;
size_t   sublen;
SARRAY  *saout;

    PROCNAME("sarraySelectBySubstring");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);

    count = sarrayGetCount(sain);
    if (count == 0 || !substr)
        return sarrayCopy(sain);

    sublen = strlen(substr);
    saout = sarrayCreate(count);
    for (idx = 0; idx < count; idx++) {
        candidate = sarrayGetString(sain, idx, L_NOCOPY);
        arrayFindSequence((l_uint8 *)candidate, strlen(candidate),
                          (l_uint8 *)substr, sublen, &where, &matched);
        if (!matched)
            continue;
        sarrayAddString(saout, candidate, L_COPY);
    }

    return saout;
}


/*!
 * \brief   sarraySelectRange()
 *
 * \param[in]    sain    input sarray
 * \param[in]    first   index of first string to be selected
 * \param[in]    last    index of last string to be selected;
 *                       use 0 to go to the end of the sarray
 * \return  saout   output sarray, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This makes %saout consisting of copies of all strings in %sain
 *          in the index set [first ... last].  Use %last == 0 to get all
 *          strings from %first to the last string in the sarray.
 *      (2) A negative %first is set to 0.  Any %last <= 0 selects to the
 *          end, and a %last beyond the end is reduced to n - 1 with
 *          a warning.
 *      (3) After these adjustments it is an error if %first > %last.
 *          This is always the case for an empty %sain.
 * </pre>
 */
SARRAY *
sarraySelectRange(SARRAY  *sain,
                  l_int32  first,
                  l_int32  last)
{
char    *str;
l_int32  n, i;
SARRAY  *saout;

    PROCNAME("sarraySelectRange");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);
    if (first < 0) first = 0;
    n = sarrayGetCount(sain);
    if (last <= 0) last = n - 1;
    if (last >= n) {
        L_WARNING("last > n - 1; setting to n - 1\n", procName);
        last = n - 1;
    }
        /* The required relation is first <= last; the message now
         * states that requirement rather than its opposite. */
    if (first > last)
        return (SARRAY *)ERROR_PTR("first must be <= last", procName, NULL);

    saout = sarrayCreate(0);
    for (i = first; i <= last; i++) {
            /* Make the copy on retrieval and hand it over to saout */
        str = sarrayGetString(sain, i, L_COPY);
        sarrayAddString(saout, str, L_INSERT);
    }

    return saout;
}


/*!
 * \brief   sarrayParseRange()
 *
 * \param[in]    sa             input sarray
 * \param[in]    start          index to start range search
 * \param[out]   pactualstart   index of actual start; may be > 'start'
 * \param[out]   pend           index of end
 * \param[out]   pnewstart      index of start of next range
 * \param[in]    substr         substring for matching at beginning of string
 * \param[in]    loc            byte offset within the string for the pattern;
 *                              use -1 if the location does not matter.
 * \return  0 if valid range found; 1 otherwise
 *
 * <pre>
 * Notes:
 *      (1) This finds the range of the next set of strings in SA,
 *          beginning the search at 'start', that does NOT have
 *          the substring 'substr' either at the indicated location
 *          in the string or anywhere in the string.  The input
 *          variable 'loc' is the specified offset within the string;
 *          use -1 to indicate 'anywhere in the string'.
 *      (2) Always check the return value to verify that a valid range
 *          was found.
 *      (3) If a valid range is not found, the values of actstart,
 *          end and newstart are all set to the size of sa.
 *      (4) If this is the last valid range, newstart returns the value n.
 *          In use, this should be tested before calling the function.
 *      (5) Usage example.  To find all the valid ranges in a file
 *          where the invalid lines begin with two dashes, copy each
 *          line in the file to a string in an sarray, and do:
 *             start = 0;
 *             while (!sarrayParseRange(sa, start, &actstart, &end, &start,
 *                    "--", 0))
 *                 lept_stderr("start = %d, end = %d\n", actstart, end);
 *      (6) In the implementation, a string is "marked" if the substring
 *          is found in it and, when loc >= 0, the match offset reported
 *          by arrayFindSequence() equals loc.
 * </pre>
 */
l_int32
sarrayParseRange(SARRAY      *sa,
                 l_int32      start,
                 l_int32     *pactualstart,
                 l_int32     *pend,
                 l_int32     *pnewstart,
                 const char  *substr,
                 l_int32      loc)
{
char    *line;
l_int32  count, idx, where, hit, marked;
size_t   sublen;

    PROCNAME("sarrayParseRange");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pactualstart || !pend || !pnewstart)
        return ERROR_INT("not all range addresses defined", procName, 1);
    count = sarrayGetCount(sa);
    *pactualstart = *pend = *pnewstart = count;
    if (!substr)
        return ERROR_INT("substr not defined", procName, 1);
    if (start < 0 || start >= count)
        return 1;
    sublen = strlen(substr);

        /* Skip over marked strings to reach the first unmarked one */
    idx = start;
    while (idx < count) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        arrayFindSequence((l_uint8 *)line, strlen(line), (l_uint8 *)substr,
                          sublen, &where, &hit);
        marked = hit && (loc < 0 || where == loc);
        if (!marked)
            break;
        idx++;
    }
    if (idx == count)  /* couldn't get started */
        return 1;
    *pactualstart = idx;

        /* Advance to the next marked string; the range ends just before */
    for (idx = idx + 1; idx < count; idx++) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        arrayFindSequence((l_uint8 *)line, strlen(line), (l_uint8 *)substr,
                          sublen, &where, &hit);
        marked = hit && (loc < 0 || where == loc);
        if (marked)
            break;
    }
    *pend = idx - 1;
    if (idx == count)  /* no further range */
        return 0;

        /* Skip the run of marked strings after *pend.  The first
         * unmarked string, if any, starts the next range. */
    for (; idx < count; idx++) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        arrayFindSequence((l_uint8 *)line, strlen(line), (l_uint8 *)substr,
                          sublen, &where, &hit);
        marked = hit && (loc < 0 || where == loc);
        if (!marked)
            break;
    }
    if (idx < count)
        *pnewstart = idx;

    return 0;
}


/*----------------------------------------------------------------------*
 *                           Serialize for I/O                          *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayRead()
 *
 * \param[in]    filename   file holding a serialized sarray
 * \return  sarray, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Opens the file, parses it with sarrayReadStream(), and
 *          closes the stream before returning.
 * </pre>
 */
SARRAY *
sarrayRead(const char  *filename)
{
FILE    *stream;
SARRAY  *result;

    PROCNAME("sarrayRead");

    if (!filename)
        return (SARRAY *)ERROR_PTR("filename not defined", procName, NULL);

    stream = fopenReadStream(filename);
    if (stream == NULL)
        return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL);

    result = sarrayReadStream(stream);
    fclose(stream);
    if (result == NULL)
        return (SARRAY *)ERROR_PTR("sa not read", procName, NULL);
    return result;
}


/*!
 * \brief   sarrayReadStream()
 *
 * \param[in]    fp    file stream
 * \return  sarray, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) We store the size of each string along with the string.
 *          The limit on the number of strings is 50M.
 *          The limit on the size of any string is 2^30 bytes.
 *      (2) This allows a string to have embedded newlines.  By reading
 *          the entire string, as determined by its size, we are
 *          not affected by any number of embedded newlines.
 *      (3) It is OK for the sarray to be empty.
 *      (4) The per-string size comes from the file and is not trusted:
 *          a negative size, or one larger than 2^30, is an error.
 *          Without the check for negative values, size + 3 would be
 *          converted to a very large unsigned byte count in fread().
 * </pre>
 */
SARRAY *
sarrayReadStream(FILE  *fp)
{
char    *stringbuf;
l_int32  i, n, size, index, bufsize, version, ignore, success;
SARRAY  *sa;

    PROCNAME("sarrayReadStream");

    if (!fp)
        return (SARRAY *)ERROR_PTR("stream not defined", procName, NULL);

    if (fscanf(fp, "\nSarray Version %d\n", &version) != 1)
        return (SARRAY *)ERROR_PTR("not an sarray file", procName, NULL);
    if (version != SARRAY_VERSION_NUMBER)
        return (SARRAY *)ERROR_PTR("invalid sarray version", procName, NULL);
    if (fscanf(fp, "Number of strings = %d\n", &n) != 1)
        return (SARRAY *)ERROR_PTR("error on # strings", procName, NULL);
    if (n < 0)
        return (SARRAY *)ERROR_PTR("num string ptrs <= 0", procName, NULL);
    if (n > MaxPtrArraySize)
        return (SARRAY *)ERROR_PTR("too many string ptrs", procName, NULL);
    if (n == 0) L_INFO("the sarray is empty\n", procName);

    success = TRUE;
    if ((sa = sarrayCreate(n)) == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
    bufsize = 512 + 1;
    stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
    if (!stringbuf) {
        sarrayDestroy(&sa);
        return (SARRAY *)ERROR_PTR("stringbuf not made", procName, NULL);
    }

    for (i = 0; i < n; i++) {
            /* Get the size of the stored string; reject values that
             * cannot be a valid byte count. */
        if ((fscanf(fp, "%d[%d]:", &index, &size) != 2) ||
            (size < 0) || (size > (1 << 30))) {
            success = FALSE;
            L_ERROR("error on string size\n", procName);
            goto cleanup;
        }
            /* Expand the string buffer if necessary.  The buffer must
             * hold size + 3 bytes: 2 leading spaces, the string, and
             * the trailing newline. */
        if (size > bufsize - 5) {
            LEPT_FREE(stringbuf);
            bufsize = (l_int32)(1.5 * size);
            stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
            if (!stringbuf) {
                success = FALSE;
                L_ERROR("stringbuf not made\n", procName);
                goto cleanup;
            }
        }
            /* Read the stored string, plus leading spaces and trailing \n */
        if (fread(stringbuf, 1, size + 3, fp) != (size_t)(size + 3)) {
            success = FALSE;
            L_ERROR("error reading string\n", procName);
            goto cleanup;
        }
            /* Remove the \n that was added by sarrayWriteStream() */
        stringbuf[size + 2] = '\0';
            /* Copy it in, skipping the 2 leading spaces */
        sarrayAddString(sa, stringbuf + 2, L_COPY);
    }
        /* Consume trailing whitespace; the result is deliberately unused */
    ignore = fscanf(fp, "\n");

cleanup:
    LEPT_FREE(stringbuf);
    if (!success) sarrayDestroy(&sa);
    return sa;
}


/*!
 * \brief   sarrayReadMem()
 *
 * \param[in]    data    serialization in ascii
 * \param[in]    size    of data; can use strlen to get it
 * \return  sarray, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The memory is read through a stream made by
 *          fopenReadFromMemory() and parsed by sarrayReadStream().
 * </pre>
 */
SARRAY *
sarrayReadMem(const l_uint8  *data,
              size_t          size)
{
FILE    *stream;
SARRAY  *result;

    PROCNAME("sarrayReadMem");

    if (!data)
        return (SARRAY *)ERROR_PTR("data not defined", procName, NULL);

    stream = fopenReadFromMemory(data, size);
    if (stream == NULL)
        return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL);

    result = sarrayReadStream(stream);
    fclose(stream);
    if (result == NULL)
        L_ERROR("sarray not read\n", procName);
    return result;
}


/*!
 * \brief   sarrayWrite()
 *
 * \param[in]    filename    file to be written
 * \param[in]    sa          string array
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The file is opened in "w" mode and the serialization is
 *          done by sarrayWriteStream().
 * </pre>
 */
l_ok
sarrayWrite(const char  *filename,
            SARRAY      *sa)
{
l_int32  status;
FILE    *stream;

    PROCNAME("sarrayWrite");

    if (!filename)
        return ERROR_INT("filename not defined", procName, 1);
    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);

    stream = fopenWriteStream(filename, "w");
    if (stream == NULL)
        return ERROR_INT("stream not opened", procName, 1);

        /* Close the stream before reporting the outcome of the write */
    status = sarrayWriteStream(stream, sa);
    fclose(stream);
    if (!status)
        return 0;
    return ERROR_INT("sa not written to stream", procName, 1);
}


/*!
 * \brief   sarrayWriteStream()
 *
 * \param[in]    fp    file stream; use NULL to write to stderr
 * \param[in]    sa    string array
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) This appends a '\n' to each string, which is stripped
 *          off by sarrayReadStream().
 *      (2) If %fp is NULL, the output is delegated to
 *          sarrayWriteStderr(), which also validates %sa.
 *      (3) Each string is written as "  index[length]:  string\n",
 *          after a header giving the version and the number of strings.
 * </pre>
 */
l_ok
sarrayWriteStream(FILE    *fp,
                  SARRAY  *sa)
{
l_int32  i, n, len;

    PROCNAME("sarrayWriteStream");

        /* A NULL stream is a request to write to stderr, as documented;
         * it is not an error.  A NULL sa is always an error. */
    if (!fp)
        return sarrayWriteStderr(sa);
    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);

    n = sarrayGetCount(sa);
    fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
    fprintf(fp, "Number of strings = %d\n", n);
    for (i = 0; i < n; i++) {
            /* The stored length lets the reader handle embedded newlines */
        len = strlen(sa->array[i]);
        fprintf(fp, "  %d[%d]:  %s\n", i, len, sa->array[i]);
    }
    fprintf(fp, "\n");

    return 0;
}


/*!
 * \brief   sarrayWriteStderr()
 *
 * \param[in]    sa    string array
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The output, emitted with lept_stderr(), uses the same text
 *          layout as sarrayWriteStream().
 * </pre>
 */
l_ok
sarrayWriteStderr(SARRAY  *sa)
{
char    *str;
l_int32  idx, count;

    PROCNAME("sarrayWriteStderr");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);

    count = sarrayGetCount(sa);
    lept_stderr("\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
    lept_stderr("Number of strings = %d\n", count);
    for (idx = 0; idx < count; idx++) {
        str = sa->array[idx];
        lept_stderr("  %d[%d]:  %s\n", idx, (l_int32)strlen(str), str);
    }
    lept_stderr("\n");
    return 0;
}


/*!
 * \brief   sarrayWriteMem()
 *
 * \param[out]   pdata    data of serialized sarray; ascii
 * \param[out]   psize    size of returned data
 * \param[in]    sa
 * \return  0 if OK, 1 on error
 *
 * <pre>
 * Notes:
 *      (1) Serializes a sarray in memory and puts the result in a buffer.
 *      (2) Whichever of %pdata and %psize is provided is cleared on
 *          entry, before any argument is rejected.
 *      (3) When HAVE_FMEMOPEN is set, the data is written to a stream
 *          from open_memstream().  Otherwise it is written to a
 *          temp file and read back with l_binaryReadStream().
 *      (4) The value returned is the status from sarrayWriteStream().
 * </pre>
 */
l_ok
sarrayWriteMem(l_uint8  **pdata,
               size_t    *psize,
               SARRAY    *sa)
{
l_int32  ret;
FILE    *fp;

    PROCNAME("sarrayWriteMem");

        /* Clear the outputs first so they are defined on every error path */
    if (pdata) *pdata = NULL;
    if (psize) *psize = 0;
    if (!pdata)
        return ERROR_INT("&data not defined", procName, 1);
    if (!psize)
        return ERROR_INT("&size not defined", procName, 1);
    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);

#if HAVE_FMEMOPEN
    if ((fp = open_memstream((char **)pdata, psize)) == NULL)
        return ERROR_INT("stream not opened", procName, 1);
    ret = sarrayWriteStream(fp, sa);
        /* Write a terminating nul byte, then exclude it from the
         * size reported to the caller. */
    fputc('\0', fp);
    fclose(fp);
    *psize = *psize - 1;
#else
    L_INFO("work-around: writing to a temp file\n", procName);
  #ifdef _WIN32
    if ((fp = fopenWriteWinTempfile()) == NULL)
        return ERROR_INT("tmpfile stream not opened", procName, 1);
  #else
    if ((fp = tmpfile()) == NULL)
        return ERROR_INT("tmpfile stream not opened", procName, 1);
  #endif  /* _WIN32 */
    ret = sarrayWriteStream(fp, sa);
        /* Go back to the start of the temp file and read it all in */
    rewind(fp);
    *pdata = l_binaryReadStream(fp, psize);
    fclose(fp);
#endif  /* HAVE_FMEMOPEN */
    return ret;
}


/*!
 * \brief   sarrayAppend()
 *
 * \param[in]    filename    file to append to
 * \param[in]    sa          string array
 * \return  0 if OK; 1 on error
 *
 * <pre>
 * Notes:
 *      (1) The file is opened in "a" mode and a full serialization of
 *          %sa, as made by sarrayWriteStream(), is written to it.
 * </pre>
 */
l_ok
sarrayAppend(const char  *filename,
             SARRAY      *sa)
{
l_int32  status;
FILE    *stream;

    PROCNAME("sarrayAppend");

    if (!filename)
        return ERROR_INT("filename not defined", procName, 1);
    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);

    stream = fopenWriteStream(filename, "a");
    if (stream == NULL)
        return ERROR_INT("stream not opened", procName, 1);

        /* The stream is closed whether or not the write succeeded */
    status = sarrayWriteStream(stream, sa);
    fclose(stream);
    if (status)
        return ERROR_INT("sa not appended to stream", procName, 1);
    return 0;
}


/*---------------------------------------------------------------------*
 *                           Directory filenames                       *
 *---------------------------------------------------------------------*/
/*!
 * \brief   getNumberedPathnamesInDirectory()
 *
 * \param[in]    dirname   directory name
 * \param[in]    substr    [optional] substring filter on filenames; can be NULL
 * \param[in]    numpre    number of characters in name before number
 * \param[in]    numpost   number of characters in name after the number,
 *                         up to a dot before an extension
 * \param[in]    maxnum    only consider page numbers up to this value
 * \return  sarray of numbered pathnames, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Returns the full pathnames of the numbered filenames in
 *          the directory.  The number in the filename is the index
 *          into the sarray.  For indices for which there are no filenames,
 *          an empty string ("") is placed into the sarray.
 *          This makes reading numbered files very simple.  For example,
 *          the image whose filename includes number N can be retrieved using
 *               pixReadIndexed(sa, N);
 *      (2) If %substr is not NULL, only filenames that contain
 *          the substring can be included.  If %substr is NULL,
 *          all matching filenames are used.
 *      (3) If no numbered files are found, it returns an empty sarray,
 *          with no initialized strings.
 *      (4) It is assumed that the page number is contained within
 *          the basename (the filename without directory or extension).
 *          %numpre is the number of characters in the basename
 *          preceding the actual page number; %numpost is the number
 *          following the page number, up to either the end of the
 *          basename or a ".", whichever comes first.
 *      (5) This is useful when all filenames contain numbers that are
 *          not necessarily consecutive.  0-padding is not required.
 *      (6) To use a O(n) matching algorithm, the largest page number
 *          is found and two internal arrays of this size are created.
 *          This maximum is constrained not to exceed %maxnum,
 *          to make sure that an unrealistically large number is not
 *          accidentally used to determine the array sizes.
 * </pre>
 */
SARRAY *
getNumberedPathnamesInDirectory(const char  *dirname,
                                const char  *substr,
                                l_int32      numpre,
                                l_int32      numpost,
                                l_int32      maxnum)
{
SARRAY  *sasorted, *sanumbered;

    PROCNAME("getNumberedPathnamesInDirectory");

    if (!dirname)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);

        /* Get all the (optionally filtered) pathnames, sorted */
    sasorted = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
    if (sasorted == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);

        /* With no files there is nothing to number */
    if (sarrayGetCount(sasorted) > 0) {
        sanumbered = convertSortedToNumberedPathnames(sasorted, numpre,
                                                      numpost, maxnum);
    } else {
        sanumbered = sarrayCreate(1);
    }
    sarrayDestroy(&sasorted);
    return sanumbered;
}


/*!
 * \brief   getSortedPathnamesInDirectory()
 *
 * \param[in]    dirname   directory name
 * \param[in]    substr    [optional] substring filter on filenames; can be NULL
 * \param[in]    first     0-based
 * \param[in]    nfiles    use 0 for all to the end
 * \return  sarray of sorted pathnames, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Use %substr to filter filenames in the directory.  If
 *          %substr == NULL, this takes all files.
 *      (2) The files in the directory, after optional filtering by
 *          the substring, are lexically sorted in increasing order.
 *          Use %first and %nfiles to select a contiguous set of files.
 *      (3) The full pathnames are returned for the requested sequence.
 *          If no files are found after filtering, returns an empty sarray.
 *      (4) %first is forced into [0 ... n - 1], and the selection is
 *          cut off at the last file.
 * </pre>
 */
SARRAY *
getSortedPathnamesInDirectory(const char  *dirname,
                              const char  *substr,
                              l_int32      first,
                              l_int32      nfiles)
{
char    *tail, *path;
l_int32  idx, count, last;
SARRAY  *saall, *safiltered, *saout;

    PROCNAME("getSortedPathnamesInDirectory");

    if (!dirname)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);

    saall = getFilenamesInDirectory(dirname);
    if (saall == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
    safiltered = sarraySelectBySubstring(saall, substr);
    sarrayDestroy(&saall);
    count = sarrayGetCount(safiltered);
    if (count == 0) {
        L_WARNING("no files found\n", procName);
        return safiltered;
    }

    sarraySort(safiltered, safiltered, L_SORT_INCREASING);

        /* Force first into [0 ... count - 1] */
    if (first < 0)
        first = 0;
    if (first > count - 1)
        first = count - 1;

        /* Find the index of the last file to use */
    if (nfiles == 0)
        nfiles = count - first;
    last = first + nfiles - 1;
    if (last > count - 1)
        last = count - 1;

        /* Prepend the directory to each selected filename */
    saout = sarrayCreate(last - first + 1);
    for (idx = first; idx <= last; idx++) {
        tail = sarrayGetString(safiltered, idx, L_NOCOPY);
        path = pathJoin(dirname, tail);
        sarrayAddString(saout, path, L_INSERT);
    }

    sarrayDestroy(&safiltered);
    return saout;
}


/*!
 * \brief   convertSortedToNumberedPathnames()
 *
 * \param[in]    sa        sorted pathnames including zero-padded integers
 * \param[in]    numpre    number of characters in name before number
 * \param[in]    numpost   number of characters in name after the number,
 *                         up to a dot before an extension
 * \param[in]    maxnum    only consider page numbers up to this value
 * \return  sarray of numbered pathnames, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Typically, numpre = numpost = 0; e.g., when the filename
 *          just has a number followed by an optional extension.
 *      (2) The size of the output is taken from the last pathname in
 *          %sa that yields a number, plus 1, and is limited by %maxnum.
 *          Each pathname is stored at the index given by its number.
 *          Indices with no pathname hold the empty string.
 *      (3) If %sa is empty, or no usable number is found, an empty
 *          sarray is returned.
 *      (4) If more than one pathname has the same number, a warning
 *          is issued and the later one in %sa replaces the earlier.
 * </pre>
 */
SARRAY *
convertSortedToNumberedPathnames(SARRAY   *sa,
                                 l_int32   numpre,
                                 l_int32   numpost,
                                 l_int32   maxnum)
{
char    *path, *current;
l_int32  k, npaths, arraysize, slot;
SARRAY  *saout;

    PROCNAME("convertSortedToNumberedPathnames");

    if (!sa)
        return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
    npaths = sarrayGetCount(sa);
    if (npaths == 0)
        return sarrayCreate(1);

        /* Going backward through the sorted array, find the first
         * pathname with a number that matches the count pattern.
         * That number, plus 1 and limited by %maxnum, is the size
         * of the output sarray. */
    arraysize = 0;
    for (k = npaths - 1; k >= 0; k--) {
        path = sarrayGetString(sa, k, L_NOCOPY);
        arraysize = extractNumberFromFilename(path, numpre, numpost);
        if (arraysize >= 0) {
            arraysize = L_MIN(arraysize + 1, maxnum);
            break;
        }
    }
    if (arraysize <= 0)  /* none found */
        return sarrayCreate(1);

        /* Put each pathname in the slot given by its number,
         * skipping numbers that are outside of the output sarray. */
    saout = sarrayCreateInitialized(arraysize, "");
    for (k = 0; k < npaths; k++) {
        path = sarrayGetString(sa, k, L_NOCOPY);
        slot = extractNumberFromFilename(path, numpre, numpost);
        if (slot >= 0 && slot < arraysize) {
            current = sarrayGetString(saout, slot, L_NOCOPY);
            if (current[0] != '\0') {
                L_WARNING("\n  Multiple files with same number: %d\n",
                          procName, slot);
            }
            sarrayReplaceString(saout, slot, path, L_COPY);
        }
    }

    return saout;
}


/*!
 * \brief   getFilenamesInDirectory()
 *
 * \param[in]    dirname     directory name
 * \return  sarray of file names, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The versions compiled under unix and cygwin use the POSIX C
 *          library commands for handling directories.  For windows,
 *          there is a separate implementation.
 *      (2) It returns an array of filename tails; i.e., only the part of
 *          the path after the last slash.
 *      (3) Use of the d_type field of dirent is not portable:
 *          "According to POSIX, the dirent structure contains a field
 *          char d_name[] of unspecified size, with at most NAME_MAX
 *          characters preceding the terminating null character.  Use
 *          of other fields will harm the portability of your programs."
 *      (4) As a consequence of (3), we note several things:
 *           ~ MINGW doesn't have a d_type member.
 *           ~ Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN
 *             for d_type from all files.
 *          On these systems, this function will return directories
 *          (except for '.' and '..', which are eliminated using
 *          the d_name field).
 * </pre>
 */

#ifndef _WIN32

SARRAY *
getFilenamesInDirectory(const char  *dirname)
{
char            dir[PATH_MAX + 1];
char           *realdir, *stat_path, *ignore;
size_t          size;
SARRAY         *safiles;
DIR            *pdir;
struct dirent  *pdirentry;
int             dfd, stat_ret;
struct stat     st;

    PROCNAME("getFilenamesInDirectory");

    if (!dirname)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
    if (dirname[0] == '\0')
        return (SARRAY *)ERROR_PTR("dirname is empty", procName, NULL);

        /* Who would have thought it was this fiddly to open a directory
           and get the files inside?  fstatat() works with relative
           directory paths, and stat() requires using the absolute path.
           realpath works as follows for files and directories:
            * If the file or directory exists, realpath returns its path;
              else it returns NULL.
            * If the second arg to realpath is passed in, the canonical path
              is returned there.  Use a buffer of sufficient size.  If the
              second arg is NULL, the path is malloc'd and returned if the
              file or directory exists.
           We pass in a buffer for the second arg, and check that the canonical
           directory path was made.  The existence of the directory is checked
           later, after its actual path is returned by genPathname().  */
    dir[0] = '\0';  /* init empty in case realpath() fails to write it */
        /* The return value is not tested; the buffer contents are */
    ignore = realpath(dirname, dir);
    if (dir[0] == '\0')
        return (SARRAY *)ERROR_PTR("dir not made", procName, NULL);
    realdir = genPathname(dir, NULL);
    if ((pdir = opendir(realdir)) == NULL) {
        LEPT_FREE(realdir);
        return (SARRAY *)ERROR_PTR("pdir not opened", procName, NULL);
    }
    safiles = sarrayCreate(0);
    while ((pdirentry = readdir(pdir))) {
#if HAVE_DIRFD && HAVE_FSTATAT
            /* Platform issues: although Linux has these POSIX functions,
             * AIX doesn't have fstatat() and Solaris doesn't have dirfd(). */
        dfd = dirfd(pdir);
        stat_ret = fstatat(dfd, pdirentry->d_name, &st, 0);
#else
            /* Build "realdir/name" for stat().  The size includes the
             * '/' and the terminating nul. */
        size = strlen(realdir) + strlen(pdirentry->d_name) + 2;
        if (size > PATH_MAX) {
            L_ERROR("size = %zu too large; skipping\n", procName, size);
            continue;
        }
        stat_path = (char *)LEPT_CALLOC(size, 1);
        snprintf(stat_path, size, "%s/%s", realdir, pdirentry->d_name);
        stat_ret = stat(stat_path, &st);
        LEPT_FREE(stat_path);
#endif
            /* Skip an entry only if it is known to be a directory.
             * An entry for which the stat call failed is retained. */
        if (stat_ret == 0 && S_ISDIR(st.st_mode))
            continue;
        sarrayAddString(safiles, pdirentry->d_name, L_COPY);
    }
    closedir(pdir);
    LEPT_FREE(realdir);
    return safiles;
}

#else  /* _WIN32 */

    /* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */
#include <windows.h>

/*
 * Windows implementation of getFilenamesInDirectory().
 *
 * Input:   dirname   directory name
 * Return:  sarray of the names of the entries in dirname that are not
 *          directories, or NULL on error
 *
 * The search pattern is the pathname from genPathname() followed by
 * "\*".  Every entry of the search is examined, including the one
 * returned by FindFirstFileA().  That entry must not be dropped,
 * because it is not guaranteed to be a directory.
 */
SARRAY *
getFilenamesInDirectory(const char  *dirname)
{
char             *pszDir;
char             *realdir;
HANDLE            hFind = INVALID_HANDLE_VALUE;
SARRAY           *safiles;
WIN32_FIND_DATAA  ffd;

    PROCNAME("getFilenamesInDirectory");

    if (!dirname)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);

    realdir = genPathname(dirname, NULL);
    pszDir = stringJoin(realdir, "\\*");
    LEPT_FREE(realdir);
    if (!pszDir)
        return (SARRAY *)ERROR_PTR("pszDir not made", procName, NULL);

    if (strlen(pszDir) + 1 > MAX_PATH) {
        LEPT_FREE(pszDir);
        return (SARRAY *)ERROR_PTR("dirname is too long", procName, NULL);
    }

    if ((safiles = sarrayCreate(0)) == NULL) {
        LEPT_FREE(pszDir);
        return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
    }

    hFind = FindFirstFileA(pszDir, &ffd);
    if (INVALID_HANDLE_VALUE == hFind) {
        sarrayDestroy(&safiles);
        LEPT_FREE(pszDir);
        return (SARRAY *)ERROR_PTR("hFind not opened", procName, NULL);
    }

        /* ffd already holds the first entry, so process it before
         * asking for the next one. */
    do {
        if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)  /* skip dirs */
            continue;
        convertSepCharsInPath(ffd.cFileName, UNIX_PATH_SEPCHAR);
        sarrayAddString(safiles, ffd.cFileName, L_COPY);
    } while (FindNextFileA(hFind, &ffd) != 0);

    FindClose(hFind);
    LEPT_FREE(pszDir);
    return safiles;
}
#endif  /* _WIN32 */
